# import cleaned data
s44 <- read.csv("//Users/khushir/Documents/Class_Notes/Time Series/Project/store_44.csv", stringsAsFactors = FALSE)
s45 <- read.csv("//Users/khushir/Documents/Class_Notes/Time Series/Project/store_45.csv", stringsAsFactors = FALSE)
s47 <- read.csv("//Users/khushir/Documents/Class_Notes/Time Series/Project/store_47.csv", stringsAsFactors = FALSE)
# drop unnecessary columns
s44 <- s44[ , c("date","sales"), drop = FALSE]
s45 <- s45[ , c("date","sales"), drop = FALSE]
s47 <- s47[ , c("date","sales"), drop = FALSE]
library(stats)
library(TSA)
##
## Attaching package: 'TSA'
## The following objects are masked from 'package:stats':
##
## acf, arima
## The following object is masked from 'package:utils':
##
## tar
library(forecast)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
## Registered S3 methods overwritten by 'forecast':
## method from
## fitted.Arima TSA
## plot.Arima TSA
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
s44$date <- as.Date(s44$date, format="%Y-%m-%d")
s45$date <- as.Date(s45$date, format="%Y-%m-%d")
s47$date <- as.Date(s47$date, format="%Y-%m-%d")
# splitting test and train
train44 <- s44[1:1674, ]
test44 <- s44[(nrow(s44) - 13):nrow(s44), ]
train45 <- s45[1:1674, ]
test45 <- s45[(nrow(s45) - 13):nrow(s45), ]
train47 <- s47[1:1674, ]
test47 <- s47[(nrow(s47) - 13):nrow(s47), ]
Store 44
train44.ts <- ts(train44$sales, frequency = 365)
test44.ts <- ts(test44$sales, frequency = 365)
# plot the time series
plot(train44, type = "l", main = "Time Series Plot of train data (Store 44)", xlab = "Date", ylab = "Sales")

# Plot ACF
acf(train44$sales, main = "ACF of Sales, train data (Store 44)", ylab = "Autocorrelation", xlab = "Lag")

# Plot PACF
pacf(train44$sales, main='PACF of Sales, train data (Store 44)')

# applying SARIMA(pdq)(PDQ)
sarima44 <- auto.arima(train44$sales, seasonal = TRUE, trace = TRUE, approximation = TRUE)
##
## Fitting models using approximations to speed things up...
##
## ARIMA(2,1,2) with drift : 35575.12
## ARIMA(0,1,0) with drift : 36378.9
## ARIMA(1,1,0) with drift : 36344.69
## ARIMA(0,1,1) with drift : 35950.38
## ARIMA(0,1,0) : 36376.9
## ARIMA(1,1,2) with drift : 35599.28
## ARIMA(2,1,1) with drift : 35593.01
## ARIMA(3,1,2) with drift : 35565.19
## ARIMA(3,1,1) with drift : 35583.49
## ARIMA(4,1,2) with drift : Inf
## ARIMA(3,1,3) with drift : Inf
## ARIMA(2,1,3) with drift : 35461.12
## ARIMA(1,1,3) with drift : 35569.39
## ARIMA(2,1,4) with drift : 35533.73
## ARIMA(1,1,4) with drift : 35533.85
## ARIMA(3,1,4) with drift : Inf
## ARIMA(2,1,3) : 35459.55
## ARIMA(1,1,3) : 35567.4
## ARIMA(2,1,2) : 35573.6
## ARIMA(3,1,3) : Inf
## ARIMA(2,1,4) : 35531.88
## ARIMA(1,1,2) : 35597.74
## ARIMA(1,1,4) : 35531.91
## ARIMA(3,1,2) : 35563.56
## ARIMA(3,1,4) : Inf
##
## Now re-fitting the best model(s) without approximations...
##
## ARIMA(2,1,3) : 35479.18
##
## Best model: ARIMA(2,1,3)
summary(sarima44)
## Series: train44$sales
## ARIMA(2,1,3)
##
## Coefficients:
## ar1 ar2 ma1 ma2 ma3
## -0.4357 -0.826 -0.1612 0.1283 -0.7296
## s.e. 0.0181 0.041 0.0345 0.0640 0.0282
##
## sigma^2 = 94414628: log likelihood = -17733.56
## AIC=35479.13 AICc=35479.18 BIC=35511.66
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 166.8022 9699.29 7602.904 -Inf Inf 0.7602784 0.1542414
checkresiduals(sarima44)

##
## Ljung-Box test
##
## data: Residuals from ARIMA(2,1,3)
## Q* = 942.38, df = 5, p-value < 2.2e-16
##
## Model df: 5. Total lags used: 10
# Forecasting
forecast44 <- forecast(sarima44, h = 14)
plot(forecast44, main = "SARIMA Forecast from 2nd to 15th Aug, 2017 (Store 44)", xlab = "day", ylab = "Sales")
abline(h = mean(train44$sales), col = "red", lty = 2)

# Plot the test values
plot(test44, type = "l", col = "black", lwd = 2, ylim = range(c(test44.ts, forecast44$mean)),
main = "Comparison of Forecasted and Actual Values (Store 44)", xlab = "Day", ylab = "Sales")
# Add the forecasted values to the plot
# lines(x = seq(length(test44.ts) + 1, length(test44.ts) + length(forecast44$mean)),
# y = forecast44$mean, col = "blue", lwd = 2)
lines(x = test44$date,
y = forecast44$mean, col = "blue", lwd = 2)
# Add legend
legend("topright", legend = c("Actual", "Forecasted"), col = c("black", "blue"), lty = 1, lwd = 2)
# Add mean of training data as a reference line
abline(h = mean(train44$sales), col = "red", lty = 2)

# Plot the test values
plot(test44, type = "l", col = "black", lwd = 2, ylim = range(c(test44, forecast44$upper)),
main = "Comparison of Forecasted and Actual Values (Store 44)", xlab = "Day", ylab = "Sales")
# Add the forecasted values to the plot
lines(x = test44$date, y = forecast44$mean, col = "orange", lwd = 2)
lines(x = test44$date, y = forecast44$upper[1:14], col = "blue", lwd = 2)
lines(x = test44$date, y = forecast44$lower[1:14], col = "purple", lwd = 2)
# Add legend
legend("topright", legend = c("Actual", "Forecasted Mean", "Forecasted Upper", "Forecasted Lower"), col = c("black", "orange", "blue", "purple"), lty = 1, lwd = 2)
abline(h = mean(train44$sales), col = "red", lty = 2)

# Install and load the Metrics package if you haven't already
library(Metrics)
##
## Attaching package: 'Metrics'
## The following object is masked from 'package:forecast':
##
## accuracy
# Calculate RMSE
rmse44 <- rmse(forecast44$mean, test44$sales)
rmse44
## [1] 9099.985
Store 45
train45.ts <- ts(train45$sales, frequency = 365)
test45.ts <- ts(test45$sales, frequency = 365)
# plot the time series
plot(train45, type = "l", main = "Time Series Plot of train data (Store 45)", xlab = "Date", ylab = "Sales")

# Plot ACF
acf(train45$sales, main = "ACF of Sales, train data (Store 45)", ylab = "Autocorrelation", xlab = "Lag")

# Plot PACF
pacf(train45$sales, main='PACF of Sales, train data (Store 45)')

# applying SARIMA(pdq)(PDQ)
sarima45 <- auto.arima(train45$sales, seasonal = TRUE, trace = TRUE, approximation = TRUE)
##
## Fitting models using approximations to speed things up...
##
## ARIMA(2,1,2) with drift : 35313.28
## ARIMA(0,1,0) with drift : 36041
## ARIMA(1,1,0) with drift : 36020.72
## ARIMA(0,1,1) with drift : 35768.77
## ARIMA(0,1,0) : 36039.01
## ARIMA(1,1,2) with drift : 35384.8
## ARIMA(2,1,1) with drift : 35351.28
## ARIMA(3,1,2) with drift : 35328.81
## ARIMA(2,1,3) with drift : 35097.09
## ARIMA(1,1,3) with drift : 35375.44
## ARIMA(3,1,3) with drift : 35240.46
## ARIMA(2,1,4) with drift : Inf
## ARIMA(1,1,4) with drift : 35348.01
## ARIMA(3,1,4) with drift : Inf
## ARIMA(2,1,3) : Inf
##
## Now re-fitting the best model(s) without approximations...
##
## ARIMA(2,1,3) with drift : Inf
## ARIMA(3,1,3) with drift : 35259.92
##
## Best model: ARIMA(3,1,3) with drift
summary(sarima45)
## Series: train45$sales
## ARIMA(3,1,3) with drift
##
## Coefficients:
## ar1 ar2 ar3 ma1 ma2 ma3 drift
## 0.1248 0.2553 -0.4213 -0.5945 -0.7705 0.5113 14.6809
## s.e. 0.0511 0.0432 0.0351 0.0473 0.0449 0.0318 31.2816
##
## sigma^2 = 82746787: log likelihood = -17621.92
## AIC=35259.83 AICc=35259.92 BIC=35303.21
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 18.89377 9074.764 6702.101 -Inf Inf 0.7509208 -0.04092634
checkresiduals(sarima45)

##
## Ljung-Box test
##
## data: Residuals from ARIMA(3,1,3) with drift
## Q* = 407.85, df = 4, p-value < 2.2e-16
##
## Model df: 6. Total lags used: 10
# Forecasting
forecast45 <- forecast(sarima45, h = 14)
plot(forecast45, main = "SARIMA Forecast from 2nd to 15th Aug, 2017 (Store 45)", xlab = "day", ylab = "Sales")
abline(h = mean(train45$sales), col = "red", lty = 2)

# Plot the test values
plot(test45, type = "l", col = "black", lwd = 2, ylim = range(c(test45.ts, forecast45$mean)),
main = "Comparison of Forecasted and Actual Values (Store 45)", xlab = "Day", ylab = "Sales")
# Add the forecasted values to the plot
# lines(x = seq(length(test45.ts) + 1, length(test45.ts) + length(forecast45$mean)),
# y = forecast45$mean, col = "blue", lwd = 2)
lines(x = test45$date,
y = forecast45$mean, col = "blue", lwd = 2)
# Add legend
legend("topright", legend = c("Actual", "Forecasted"), col = c("black", "blue"), lty = 1, lwd = 2)
# Add mean of training data as a reference line
abline(h = mean(train45$sales), col = "red", lty = 2)

# Plot the test values
plot(test45, type = "l", col = "black", lwd = 2, ylim = range(c(test45, forecast45$upper)),
main = "Comparison of Forecasted and Actual Values (Store 45)", xlab = "Day", ylab = "Sales")
# Add the forecasted values to the plot
lines(x = test45$date, y = forecast45$mean, col = "orange", lwd = 2)
lines(x = test45$date, y = forecast45$upper[1:14], col = "blue", lwd = 2)
lines(x = test45$date, y = forecast45$lower[1:14], col = "purple", lwd = 2)
# Add legend
legend("topright", legend = c("Actual", "Forecasted Mean", "Forecasted Upper", "Forecasted Lower"), col = c("black", "orange", "blue", "purple"), lty = 1, lwd = 2)
abline(h = mean(train45$sales), col = "red", lty = 2)

# Install and load the Metrics package if you haven't already
library(Metrics)
# Calculate RMSE
rmse45 <- rmse(forecast45$mean, test45$sales)
rmse45
## [1] 7984.638
Store 47
train47.ts <- ts(train47$sales, frequency = 365)
test47.ts <- ts(test47$sales, frequency = 365)
# plot the time series
plot(train47, type = "l", main = "Time Series Plot of train data (Store 47)", xlab = "Date", ylab = "Sales")

# Plot ACF
acf(train47$sales, main = "ACF of Sales, train data (Store 47)", ylab = "Autocorrelation", xlab = "Lag")

# Plot PACF
pacf(train47$sales, main='PACF of Sales, train data (Store 47)')

# applying SARIMA(pdq)(PDQ)
sarima47 <- auto.arima(train47$sales, seasonal = TRUE, trace = TRUE, approximation = TRUE)
##
## Fitting models using approximations to speed things up...
##
## ARIMA(2,1,2) with drift : 34980.23
## ARIMA(0,1,0) with drift : 35734.26
## ARIMA(1,1,0) with drift : 35720.5
## ARIMA(0,1,1) with drift : 35469.22
## ARIMA(0,1,0) : 35732.26
## ARIMA(1,1,2) with drift : 35040.2
## ARIMA(2,1,1) with drift : 35001.59
## ARIMA(3,1,2) with drift : 34982.35
## ARIMA(2,1,3) with drift : 34975.92
## ARIMA(1,1,3) with drift : 35036.46
## ARIMA(3,1,3) with drift : 34906.46
## ARIMA(4,1,3) with drift : Inf
## ARIMA(3,1,4) with drift : Inf
## ARIMA(2,1,4) with drift : Inf
## ARIMA(4,1,2) with drift : Inf
## ARIMA(4,1,4) with drift : Inf
## ARIMA(3,1,3) : 34904.71
## ARIMA(2,1,3) : 34974.93
## ARIMA(3,1,2) : 34981.23
## ARIMA(4,1,3) : Inf
## ARIMA(3,1,4) : Inf
## ARIMA(2,1,2) : 34978.69
## ARIMA(2,1,4) : Inf
## ARIMA(4,1,2) : Inf
## ARIMA(4,1,4) : Inf
##
## Now re-fitting the best model(s) without approximations...
##
## ARIMA(3,1,3) : 34925.65
##
## Best model: ARIMA(3,1,3)
summary(sarima47)
## Series: train47$sales
## ARIMA(3,1,3)
##
## Coefficients:
## ar1 ar2 ar3 ma1 ma2 ma3
## 0.1721 0.1803 -0.3921 -0.6335 -0.6993 0.4738
## s.e. 0.0558 0.0468 0.0376 0.0518 0.0532 0.0326
##
## sigma^2 = 67800206: log likelihood = -17455.79
## AIC=34925.58 AICc=34925.65 BIC=34963.54
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 113.6987 8216.854 6214.637 -Inf Inf 0.7762624 -0.03043194
checkresiduals(sarima47)

##
## Ljung-Box test
##
## data: Residuals from ARIMA(3,1,3)
## Q* = 546.98, df = 4, p-value < 2.2e-16
##
## Model df: 6. Total lags used: 10
# Forecasting
forecast47 <- forecast(sarima47, h = 14)
plot(forecast47, main = "SARIMA Forecast from 2nd to 15th Aug, 2017 (Store 47)", xlab = "day", ylab = "Sales")
abline(h = mean(train47$sales), col = "red", lty = 2)

# Plot the test values
plot(test47, type = "l", col = "black", lwd = 2, ylim = range(c(test47.ts, forecast47$mean)),
main = "Comparison of Forecasted and Actual Values (Store 47)", xlab = "Day", ylab = "Sales")
# Add the forecasted values to the plot
lines(x = test47$date,
y = forecast47$mean, col = "blue", lwd = 2)
# Add legend
legend("topright", legend = c("Actual", "Forecasted"), col = c("black", "blue"), lty = 1, lwd = 2)
abline(h = mean(train47$sales), col = "red", lty = 2)

# Plot the test values
plot(test47, type = "l", col = "black", lwd = 2, ylim = range(c(test47, forecast47$upper)),
main = "Comparison of Forecasted and Actual Values (Store 47)", xlab = "Day", ylab = "Sales")
# Add the forecasted values to the plot
lines(x = test47$date, y = forecast47$mean, col = "orange", lwd = 2)
lines(x = test47$date, y = forecast47$upper[1:14], col = "blue", lwd = 2)
lines(x = test47$date, y = forecast47$lower[1:14], col = "purple", lwd = 2)
# Add legend
legend("topright", legend = c("Actual", "Forecasted Mean", "Forecasted Upper", "Forecasted Lower"), col = c("black", "orange", "blue", "purple"), lty = 1, lwd = 2)
abline(h = mean(train47$sales), col = "red", lty = 2)

# Install and load the Metrics package if you haven't already
library(Metrics)
# Calculate RMSE
rmse47 <- rmse(forecast47$mean, test47$sales)
rmse47
## [1] 7290.442